* Replication code for Hirst & Robertson (2022)
* File 1 of 4 
* This file was written for Stata 16.1 and has not been tested on any other version.
* Please read the README file before continuing. 

clear all
cd "C:\your\data\directory\" //Change this to where you are storing the files
local date FINAL

/*DATA IN
The following must be in the "Raw" subdirectory of the data directory,
or the file locations must be changed manually in the code:
"Raw\master_2003_Q1.idx" - "Raw\master_2020_Q4.idx" //Raw index files from EDGAR
"Raw\ISS_data.dta" \\ISS data (from WRDS)
"Raw\CIK_CUSIP.csv" \\CIK Cusip crosswalk from Ekaterina Volkova: https://sites.google.com/view/evolkova/data-cik-cusip-link 
"Raw\CRSP_MF_2003.dta" - "Raw\CRSP_MF_2020.dta" //CRSP muitual fund holdings data (from WRDS)
"Raw\Compustat_2000_2020.dta" \\Compustat (from WRDS)
"Raw\CRSP_2002_2019.dta" \\CRSP (from WRDS)
*/


/* Combine Raw EDGAR Files*/
if 1 == 1 {
	forvalues i = 2002 / 2020 {
		forvalues j = 1/4 {
			di as error "This is master_`i'_Q`j'"
			import delimited "Raw\master_`i'_Q`j'.idx", varnames(10) rowrange(12) clear
			gen date = date(datefiled, "YMD")
			format date %td
			gen quarter = qofd(date)
			format quarter %tq
			assert quarter == tq(`i'q`j')
			gen prelim_proxy = 0
			foreach val in "PRE 14A" "PRE 14C" "PREC14A" "PREM14A" "PREM14C" ///
				"PREN14A" "PRER14A" "PRER14C" "PRRN14A" {
				replace prelim_proxy = 1 if formtype == "`val'"
				}
			gen def_proxy = 0
			foreach val in "DEF 14A" "DEF 14C" "DEFA14A" "DEFA14C" "DEFC14A" ///
				"DEFM14A"  "DEFN14A" "DEFM14C" "DEFR14A" "DEFR14C" "DFAN14A" {
				replace def_proxy = 1 if formtype == "`val'"
				}
			gen proxy = 0
			replace proxy = 1 if def_proxy== 1 | prelim_proxy == 1
			keep if proxy == 1 
			tempfile master_`i'_Q`j'
			save `master_`i'_Q`j'', replace
*			sleep 500
			}
		}
		
	clear
	forvalues i = 2002 / 2020 {
		forvalues j = 1/4 {
			append using `master_`i'_Q`j''
			}
		}
	gen year = year(date)
	destring cik, replace
	save "Edgar_2003_2020.dta", replace
	
	*Extract CIK x years*
	keep cik year 
	duplicates drop 
	save "Edgar_cik_year.dta", replace //ISS identifiers are cusip and ticker -- a crosswalk is necessary from cik
	}

/* Augment ISS with Institutional Holdings and MF Holdings */
if 1 == 1 {
	//First tidy up CRSP data  
	if 0 == 1 {
		 use "Raw\CRSP_2002_2019.dta" , clear
		 drop ACCOMP ACPERM ALTPRC ALTPRCDT ASK ASKHI BID BIDLO CFACPR CFACSHR DCLRDT DISTCD DIVAMT DLAMT DLPDT DLPRC DLRET DLRETX DLSTCD FACPR FACSHR HSICCD HSICIG HSICMG ISSUNO MMCNT NAICS NAMEENDT NEXTDT NMSIND NSDINX NWPERM PAYDT RCRDDT RETX SHRENDDT SHRFLG SPREAD TRTSCD TSYMBOL ewretd ewretx sprtrn vwretd vwretx
		 duplicates drop
		 save "Raw\CRSP_2002_2019_short.dta" , replace
		}

	
	//Start with CRSP MF Holdings - prep the files
	*Import raw files and extract the total shares held by MFs on a given date
	if 1 == 1 {
		foreach year in 2002_2003 2004_2005 2006_2007 2008 2009 2010 2011 2012 2013 ///
			2014 2015 2016 2017 2018 2019 2020  {  
			use "Raw\CRSP_MF_`year'.dta", clear
	 
			*Some funds report monthly, others quarterly. We keep the first report per fund x quarter
			gen quarter = qofd(report_dt)
			format quarter %tq 
			egen last_date = max(report_dt), by(quarter crsp_portno) // Identify the last date in each fund x quarter 
			gen _keep = 1 if report_dt == last_date	// Mark filings from that date 
			keep if _keep == 1 // Keep only those filings 
			drop last_date _keep quarter 
			
			*for each permno x month, compute total number of shares held 
			keep report_dt nbr_shares permno
			collapse (sum) nbr_shares, by (permno report_dt)
			label var nbr_shares "Total number of shares held by MFs this period"
			tempfile CRSP_`year'
			save "Raw\CRSP_`year'", replace 
			}
		}
	clear
	*Combine the annual files
	foreach year in 2002_2003 2004_2005 2006_2007 2008 2009 2010 2011 2012 2013 ///
		2014 2015 2016 2017 2018 2019 2020 { 
		append using "Raw\CRSP_`year'" 
		}
	*Clean up any duplicates. These are probably late reports -- they should be added up  
	collapse (sum) nbr_shares, by (permno report_dt)
	label var nbr_shares "Total number of shares held by MFs this period"
	rename report_dt date 
	
	rename permno PERMNO 
	
	*Now we need to merge in CRSP total shares outstanding so that we can turn this into a percentage
	merge 1:1 PERMNO date using "Raw\CRSP_2002_2019_short.dta" 

	gen cusip8 = substr(NCUSIP, 1,8)
	drop if cusip8 == ""
	drop if _merge == 2 
	drop _merge
	duplicates r  cusip8 date // confirm no duplicates by cusip8 date 
	gen month = mofd(date)
	duplicates r  cusip8 month // confirm no duplicates by cusip8 month 

	/*. duplicates report month cusip8

	Duplicates in terms of month cusip8

	--------------------------------------
	   copies | observations       surplus
	----------+---------------------------
			1 |      725917              0
	--------------------------------------
	*/
	*Now we will need to convert this to quarterly 
		//This is a bit messy, since the reporting is now quarterly BUT not all MFs report in the same 
		//month and the number of shares could change within a quarter. So we will compute 
		//the percentage reported per month, and just add this up by quarter. 
 	
	gen pct_MF = (nbr_shares / SHROUT) / 1000 
	gen quarter = qofd(date) 
	format quarter %tq 
	gen SHRCD_1 = 0
	replace SHRCD_1 = 1 if SHRCD >= 10 & SHRCD <=19
	gen SHRCD_7 = 0
	replace SHRCD_7 = 1 if SHRCD >= 70 & SHRCD <=79
	gen SHRCD_04 = 0 
	foreach i in 14 15 24 25 34 35 44 45 74 75 {
		replace SHRCD_04 = 1 if SHRCD == `i' 
		}
	gen SHRCD_10_11 = 0
	replace SHRCD_10_11 = 1 if SHRCD >= 10 & SHRCD <=11	
	foreach i in "N" "A" "Q" "X" "R" {
		gen PRIMEXCH_`i' = 0 
		replace PRIMEXCH_`i' = 1 if PRIMEXCH == "`i'"
		}
	replace  PRIMEXCH_X = 1 if PRIMEXCH == "B"	
	replace  PRIMEXCH_X = 1 if PRIMEXCH == "I"	
	
	collapse (sum) pct_MF (mean) SHRCD_1 SHRCD_7 SHRCD_10_11 SHRCD_04 PRIMEXCH_*, by (cusip8 quarter)	
	label var PRIMEXCH_N "NYSE"
	label var PRIMEXCH_A "NYSE American"
	label var PRIMEXCH_Q "NASDAQ"
	label var PRIMEXCH_X "Other"
	label var PRIMEXCH_R "ARCA"
	
	label var pct_MF "Percentage held by MF"
	tempfile MF_temp 
	save `MF_temp', replace 
*	save "MF_temp", replace 

	
	*Start with MF 
	use "Raw\ISS_data.dta", clear
	gen quarter = qofd(MeetingDate)
	format quarter %tq
	gen cusip8 = substr(CUSIP,1,8)
	merge m:1 cusip8 quarter using `MF_temp'
*	merge m:1 cusip8 quarter using "MF_temp"
 	
	gen year = year(dofq(quarter))
	drop if year < 2003
	
	/* 
	. tab _merge  
                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
        master only (1) |    160,317       16.10       16.10
         using only (2) |    326,905       32.82       48.92
            matched (3) |    508,772       51.08      100.00
------------------------+-----------------------------------
                  Total |    995,994      100.00
	*/
	//What we care about is how many firm x quarters from ISS are missing from the holdings data
	preserve 
		keep cusip8 quarter _merge year
		duplicates drop
		tab _merge // We don't care about _merge == 2
		tab _merge if _merge != 2 
	/* . tab _merge if _merge != 2  

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
        master only (1) |     22,669       27.23       27.23
            matched (3) |     60,567       72.77      100.00
------------------------+-----------------------------------
                  Total |     83,236      100.00

	*/
		restore
	gen _merge_MF = 1
	replace _merge_MF = 0 if _merge == 1
	drop _merge 
	

*	rename InstOwn_Perc pct_Inst 
	save "ISS_augmented.dta", replace
	}
  
/* Merge EDGAR with ISS */
if 1 == 1 {
	//Extract CIK, cusip and ticker from Compustat 
	use "Raw\Compustat_2000_2020.dta", clear
	gen year = year(datadate)
	keep cik tic cusip year
	destring cik, replace
	drop if cik == .
	drop if cusip == ""
	duplicates drop 
	
	//Merge it with cik x year from EDGAR to figure out which to keep
	merge 1:m cik year using "Edgar_cik_year.dta"
	drop if year < 2002
	drop if year > 2020
	
	/* 
	tab _merge
	duplicates r cik year	
	distinct cik if _merge == 3
	distinct cik if _merge == 2
	distinct cik if _merge == 1

.         tab _merge

				 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
        master only (1) |     64,720       35.41       35.41 // Compustat only, can't match to EDGAR
         using only (2) |     28,922       15.82       51.23 // in EDGAR, can't match cik using Compustat
            matched (3) |     89,153       48.77      100.00
------------------------+-----------------------------------
                  Total |    182,795      100.00

	.         duplicates r cik year   

	Duplicates in terms of cik year

	--------------------------------------
	   copies | observations       surplus
	----------+---------------------------
			1 |       182795             0
	--------------------------------------

	.         distinct cik if _merge == 3

		   |        Observations
		   |      total   distinct
	-------+----------------------
	   cik |      89153      11355

	.         distinct cik if _merge == 2

		   |        Observations
		   |      total   distinct
	-------+----------------------
	   cik |      28922      12485 

	.         distinct cik if _merge == 1

		   |        Observations
		   |      total   distinct
	-------+----------------------
	   cik |      64720      12832


	*/
	// Put aside the ones that don't match. We will start with the ones that do match
	preserve 
		keep if _merge == 2 
		drop _merge cusip tic 
		tempfile tempfile1 
		save `tempfile1', replace //These are cik x years that didn't match to Compustat. 
			//We can use the other file for them, or some other means of matching them, later  
		restore 
	
	//Now merge this back into our EDGAR data. This gives us a mapping between cusip (for ISS) and cik (for EDGAR)
	keep if _merge == 3
	gen cusip8 = substr(cusip,1,8)
	gen cusip6 = substr(cusip,1,6)
	drop _merge
	rename cusip cusip_raw 
	merge 1:m cik year using "Edgar_2003_2020.dta"
 	
	/* tab _merge
    Result                           # of obs.
    -----------------------------------------
    not matched                       100,921
        from master                         0  (_merge==1)
        from using                    100,921  (_merge==2) // Votes that we can't match on cik using Compustat

    matched                           194,003  (_merge==3)
    -----------------------------------------
	*/ 

	keep if _merge == 3
	drop _merge 
	duplicates drop 
	tempfile cik_matched 
	save `cik_matched', replace 
	
	use `cik_matched', clear 

	tempfile temp 
	save `temp', replace
	
	// Merge this with ISS Meeting date - we merge by cusip6 x day
	use "ISS_augmented.dta" , clear
*	gen cusip8 = substr(CUSIP, 1, 8)
	gen cusip6 = substr(CUSIP, 1, 6)
*	gen year = year(Recorddate)
*	gen date = MeetingDate
  
	keep if SHRCD_10_11 == 1 | SHRCD_10_11  == . // drop securities that are not common equity
	keep if SHRCD_10_11 == 1 
	
	bysort MeetingID Recorddate cusip6 CompanyID MeetingType MeetingDate: keep if _n == 1 // Collapse the file to MeetingID level 

	
	bysort MeetingID Recorddate cusip6 CompanyID MeetingType (MeetingDate): keep if _n == 1 // There are two duplicates (same MeetingID and record date, different meeting date) -- pick the earlier. 
	/// These were both cases where the vote was "pending" the first time, and they had to vote a second time. They are also both mutual funds, so they should be dropped anyway. 
	duplicates r MeetingID // confirm that MeetingID is unique -- it is	
	/* 	Duplicates in terms of MeetingID
	Duplicates in terms of MeetingID
	--------------------------------------
	   copies | observations       surplus
	----------+---------------------------
			1 |        53302             0
	--------------------------------------
	*/
 
	keep MeetingID Recorddate cusip6 MeetingType MeetingDate  year PRIMEXCH_N PRIMEXCH_A PRIMEXCH_Q PRIMEXCH_X PRIMEXCH_R SHRCD_10_11 
	drop if cusip6 == ""
  		
	*Strategy for mapping a meeting to filings: We assign each MeetingID within a firm an identifier. Then we iterate over these identifiers. Each time, we merge on cusip6 to EDGAR data, and keep all filings that occured up to 180 days before the meeting date. Later, we append these files back together.  

	bysort cusip6: gen N = _n 
		qui sum N 
		local max = r(max)
	
	forvalues i = 1 / `max' {
		preserve 
			keep if N == `i'
			merge 1:m cusip6 using `temp'
			drop if _merge == 2 
			keep if date >= MeetingDate - 180 & date < MeetingDate
			label var date "Filing date"
			tempfile temp`i'
			save `temp`i'', replace
			restore
		}
	clear 
	forvalues i = 1 / `max' {
		append using `temp`i'' 
	}
	gen before_record = 0
	replace before_record = 1 if date < Recorddate
	egen proxy_before = max(before_record), by(MeetingID)
	preserve 
		bysort MeetingID: keep if _n == 1 
		sum proxy_before , det
		restore
	
	drop if MeetingID == .
	drop _merge
	duplicates drop
	
	save "MeetingLevel", replace 
	
	egen max_prelim = max(prelim_proxy), by(MeetingID)
	egen max_def = max(def_proxy), by(MeetingID)
	drop  formtype datefiled filename date quarter prelim_proxy def_proxy proxy before_record N
	duplicates drop
	
	duplicates drop MeetingID MeetingType Recorddate MeetingDate cusip6 proxy_before max_prelim max_def, force
	merge 1:m MeetingID using "ISS_augmented.dta" 
	keep if SHRCD_10_11 == 1 | SHRCD_10_11  == . // drop securities that are not common equity
	keep if _merge == 3 //NOTE: these are all the votes we can't match again
	keep if SHRCD_10_11 == 1 
	duplicates drop
	
	save "Voting_combined", replace
	keep cusip6 cusip8 
	duplicates drop 
	save "cusips_for_FIS", replace
	}

